In [28]:
    
%matplotlib
    
    
In [2]:
    
import sys
import pandas as pa
import numpy as np
from primetext import primetext
import matplotlib.pyplot as plt
from nltk.stem.lancaster import LancasterStemmer
from autocorrect import spell
st = LancasterStemmer()
    
In [3]:
    
# Create the primetext object used below for indexing and term look-ups.
pt = primetext()
# Load the labelled comments; the file is decoded as ISO-8859-1.
ytData = pa.read_csv("utubelabled.csv",encoding ='ISO-8859-1')
# Raw comment text; the matching labels are read from ytData['troll'] later on.
comments = ytData['comment']
    
In [4]:
    
# NOTE(review): as written this replaces the empty string with the empty string.
# The intended search pattern may have been lost (e.g. a non-printing character) — confirm.
comments = comments.str.replace('','')
    
In [5]:
    
def cleanData(records,labels):
    """Reduce raw comments to stemmed, spell-corrected text and keep labels aligned.

    A record is kept only if it is a string shorter than 200 characters and at
    least one of its space-separated words is purely alphabetic and shorter
    than 12 characters. Each surviving word is lower-cased, passed through
    ``spell`` and then stemmed with ``st``.

    Parameters:
        records: iterable of comments. Non-string entries (for example NaN
            produced by empty CSV cells) are skipped instead of raising.
        labels: iterable of labels, paired with ``records`` by position.

    Returns:
        (output, outputLabels): the cleaned sentences and the labels of the
        records that survived, in their original order.
    """
    output = []
    outputLabels = []
    recordsToCheck = len(records)
    # zip pairs each record with its label positionally, so the pairing does
    # not depend on how ``labels`` is indexed.
    for recordsChecked, (sentence, label) in enumerate(zip(records, labels), start=1):
        sys.stdout.write("\rRecords cleaned : %i / %i" % (recordsChecked,recordsToCheck))
        if not isinstance(sentence, str) or len(sentence) >= 200:
            continue
        cleanWords = [
            st.stem(spell(word.lower()))
            for word in sentence.split(' ')
            if len(word) < 12 and word.isalpha()
        ]
        if cleanWords:
            output.append(' '.join(cleanWords).strip())
            outputLabels.append(label)
    sys.stdout.write("\n")
    sys.stdout.flush()
    return output,outputLabels
    
In [6]:
    
# Clean every comment and keep the 'troll' label of each record that survives.
cleanedRecords, cleanedLabels = cleanData(comments, ytData['troll'])
    
    
In [7]:
    
# Hand the cleaned records to primetext for indexing; the look-ups below
# (countInRecords, find, indexedDictionary, cleanedDictionary) follow this call.
pt.index(cleanedRecords)
    
    
In [8]:
    
# One count per indexed term, as reported by pt.countInRecords.
keyText = [term for term, _ in pt.indexedDictionary.items()]
keyCount = [pt.countInRecords([term]) for term in keyText]

# Keep the 50 terms with the highest counts and chart them.
s1 = pa.Series(data=keyCount, index=keyText)
sortedS1 = s1.sort_values(ascending=False).iloc[:50]
sortedS1.plot.bar()
    
    Out[8]:
    
In [9]:
    
# Square term-by-term table for the top terms, created directly as integer
# zeros. Building an empty (object-dtype, all-NaN) frame and calling fillna(0)
# relies on pandas silently downcasting the result, which is deprecated and
# can leave an object-dtype frame that imshow cannot render.
df = pa.DataFrame(0, index=sortedS1.index, columns=sortedS1.index)
    
In [10]:
    
# Fill the table: cell (row, col) holds pt.countInRecords for the pair of terms.
names = sortedS1.index
colsdone = 0
for col in names:
    colsdone += 1
    sys.stdout.write("\rCols done : %i" % colsdone)
    for row in names:
        # A single .loc assignment writes into df itself; the chained form
        # df[col][row] = ... may assign to a temporary and leave df unchanged.
        df.loc[row, col] = pt.countInRecords([col,row])
sys.stdout.write("\n")
sys.stdout.flush()
    
    
In [11]:
    
# Heat map of the term-pair table, with the terms as tick labels on both axes.
tickPositions = range(len(names))
imgplot = plt.imshow(df, interpolation="nearest")
plt.xticks(tickPositions, names, rotation=90)
plt.yticks(tickPositions, names, rotation=0)
plt.colorbar()
plt.show()
    
    
In [12]:
    
# Labels of the cleaned records as a Series, so they can be masked and summed.
myLabels = pa.Series(cleanedLabels)
# Sum of the labels (the number of troll comments if labels are 0/1 — TODO confirm).
myLabels.sum()
    
    Out[12]:
In [13]:
    
# Class weights: each class is weighted by the share of the *other* class.
totalComments = myLabels.count()
totalTrollComments = myLabels.sum()
# Fraction of comments that are NOT counted by the label sum.
trollWeight = (totalComments-totalTrollComments)/totalComments
# Fraction of comments that ARE counted by the label sum.
nonTrollWeight = totalTrollComments/totalComments
trollWeight, nonTrollWeight
    
    Out[13]:
In [14]:
    
# For each troll comment, add trollWeight to the score of every word it contains.
# For each non-troll comment, subtract nonTrollWeight from the score of every word it contains.
    
In [15]:
    
# Number of entries in pt.cleanedDictionary, the collection the scoring loop iterates over.
len(pt.cleanedDictionary)
    
    Out[15]:
In [16]:
    
# Worked example for the single word 'the': sum the labels of the records
# selected by pt.find(['the']).
# assumes pt.find returns a boolean mask aligned with the cleaned records — TODO confirm
totalFoundTrolling = myLabels[pt.find(['the'])].sum()
totalFoundTrolling
    
    Out[16]:
In [17]:
    
# Remaining matches for 'the': the total reported by pt.find minus the label sum above.
totalFoundNotTrolling = pt.find(['the']).sum() - totalFoundTrolling
totalFoundNotTrolling
    
    Out[17]:
In [18]:
    
# Score for 'the': weighted troll matches minus weighted non-troll matches.
trollScore = (totalFoundTrolling * trollWeight) - (totalFoundNotTrolling * nonTrollWeight)
trollScore
    
    Out[18]:
In [19]:
    
# Repeat the single-word calculation for every word in pt.cleanedDictionary.
trollScores = []
for word in pt.cleanedDictionary:
    # Look the word up once and reuse the result for both counts.
    wordMatches = pt.find([word])
    totalFoundTrolling = myLabels[wordMatches].sum()
    totalFoundNotTrolling = wordMatches.sum() - totalFoundTrolling
    trollScore = (totalFoundTrolling * trollWeight) - (totalFoundNotTrolling * nonTrollWeight)
    trollScores.append(trollScore)
    
In [20]:
    
# Per-word scores, indexed by the entries of pt.cleanedDictionary.
s2 = pa.Series(trollScores,index= pt.cleanedDictionary)
# Ascending sort: these are the 100 LOWEST scores, despite the "Pos" in the name.
sortedPos  = s2.sort_values(ascending= True)[:100]
sortedPos.plot.bar()
print(sortedPos)
    
    
    
In [29]:
    
# Rebuilds s2 from the same trollScores and pt.cleanedDictionary inputs.
s2 = pa.Series(trollScores,index= pt.cleanedDictionary)
# Descending sort: these are the 100 HIGHEST scores, despite the "Neg" in the name.
sortedNeg  = s2.sort_values(ascending= False)[:100]
sortedNeg.plot.bar()
    
    Out[29]:
In [ ]:
    
    
In [22]:
    
def calTrollScore(comment, scores=None):
    """Sum the per-word scores of every known word in a comment.

    Parameters:
        comment: the comment to score. It is converted with ``str`` and split
            on single spaces.
        scores: mapping from word to score supporting ``in`` and ``[]``
            (a dict or a word-indexed pandas Series). Defaults to the
            notebook-level ``s2`` so existing one-argument calls are unchanged.

    Returns:
        The total of ``scores[word]`` over the words present in ``scores``;
        0 when no word is known.
    """
    if scores is None:
        # Resolved at call time so the function follows the current s2.
        scores = s2
    return sum(scores[word] for word in str(comment).split(' ') if word in scores)
    
In [23]:
    
# Spot check: print the first ten cleaned records next to their scores.
for i in range(10):
    print(cleanedRecords[i],calTrollScore(cleanedRecords[i]))
    
    
In [24]:
    
def predictTroll(comment,theta):
    """Return True when the comment's score is strictly above the threshold theta[0]."""
    return calTrollScore(comment) > theta[0]
def costTrollPredict(theta):
    """Predict every record in cleanedRecords at the threshold theta[0].

    Returns:
        A list of booleans, one per cleaned record, in order. The list was
        previously built but never returned, so callers always got None.
    """
    return [predictTroll(c, theta) for c in cleanedRecords]
    
In [ ]:
    
    
In [25]:
    
# Predictions at a threshold of 30, and the labels as booleans (True where the label equals 1.0).
pred = costTrollPredict([30])
trueVal = [v == 1.0 for v in cleanedLabels]
    
In [26]:
    
from sklearn.metrics import f1_score
    
In [27]:
    
# F1 score of the threshold-30 predictions against the boolean labels.
f1_score(trueVal, pred)
    
    
In [ ]:
    
# F1 score for every integer threshold from -30 to 69.
thresholds = list(range(-30,70))
vals = []
for i in thresholds:
    predt = costTrollPredict([i])
    cost = f1_score(trueVal, predt)
    vals.append(cost)
# Plot against the thresholds themselves; plt.plot(vals) alone would label
# the x-axis with list positions 0..99 rather than the values -30..69.
plt.plot(thresholds, vals)
    
In [ ]:
    
# Number of records predicted True at a threshold of 50.
np.asarray(costTrollPredict([50])).sum()
    
In [ ]:
    
# Boolean labels as a numpy array, so they can be indexed with prediction masks.
output = np.asarray(trueVal)
    
In [ ]:
    
# Rebinds pred to the predictions at a threshold of 0, as a numpy array.
pred = np.asarray(costTrollPredict([0]))
    
In [ ]:
    
# Number of True entries in the label array.
output.sum()
    
In [ ]:
    
# Number of predictions (one per cleaned record).
len(pred)
    
In [ ]:
    
# Number of True labels among the records predicted True.
output[pred].sum()
    
In [ ]:
    
# Display the threshold-0 prediction array.
pred
    
In [ ]:
    
def calculateEffect(predFunc):
    """Sweep thresholds from -100 to 95 in steps of 5 and measure hit rates.

    Parameters:
        predFunc: callable taking a one-element list ``[threshold]`` and
            returning one boolean prediction per record.

    Returns:
        (plotLog, plotFalse, plotx): for each threshold, the percentage of
        True labels in the notebook-level ``output`` array that were
        predicted True, the percentage of False labels that were predicted
        True, and the threshold itself.
    """
    notTroll = (output == False)
    caughtPc, falseAlarmPc, thresholds = [], [], []
    for threshold in range(-100,100,5):
        flagged = np.asarray(predFunc([threshold]))
        caught = output[flagged].sum()
        falseAlarms = notTroll[flagged].sum()
        falseAlarmPc.append((100/notTroll.sum())*falseAlarms)
        caughtPc.append((100/output.sum())*caught)
        thresholds.append(threshold)
    return caughtPc, falseAlarmPc, thresholds
# Threshold sweep for the full-dictionary predictor.
plotLog, plotFalse, plotx = calculateEffect(costTrollPredict)
    
In [ ]:
    
# Green: % of trolls caught; red: % of non-trolls flagged; blue: the gap between them.
plotDiff = [caught - flagged for caught, flagged in zip(plotLog, plotFalse)]
plt.title('Plot of % true positives against false positives (Trolls caught)')
for series, colour in ((plotLog, 'g'), (plotFalse, 'r'), (plotDiff, 'b')):
    plt.plot(plotx, series, c=colour)
plt.axvline(0,linestyle = 'dashed')
    
In [ ]:
    
# Restrict scoring to the first 20 entries of each ranked list.
usedNeg = sortedNeg[:20]
usedPos = sortedPos[:20]
def calTrollScoreSim(comment):
    """Score a comment using only the words in usedNeg and usedPos.

    The comment is converted with ``str`` and split on single spaces. A word
    found in ``usedNeg`` contributes its ``usedNeg`` score; otherwise a word
    found in ``usedPos`` contributes its ``usedPos`` score. Other words add
    nothing.
    """
    total = 0
    for token in str(comment).split(' '):
        if token in usedNeg:
            total += usedNeg[token]
            continue
        if token in usedPos:
            total += usedPos[token]
    return total
    
In [ ]:
    
def predictTroll2(comment,theta):
    """Return True when the reduced-vocabulary score is strictly above theta[0]."""
    threshold = theta[0]
    return calTrollScoreSim(comment) > threshold
def costTrollPredict2(theta):
    """Return one predictTroll2 result per record in cleanedRecords, in order."""
    return [predictTroll2(record, theta) for record in cleanedRecords]
    
In [ ]:
    
# Reduced-vocabulary predictions at a threshold of 0 (displays the full list).
costTrollPredict2([0])
    
In [ ]:
    
# F1 score of the reduced-vocabulary predictor for thresholds 0, 5, ..., 135.
thresholds = list(range(0,140,5))
vals = []
for i in thresholds:
    predt = costTrollPredict2([i])
    cost = f1_score(trueVal, predt)
    vals.append(cost)
# Plot against the thresholds themselves; plt.plot(vals) alone would label
# the x-axis with list positions rather than the threshold values.
plt.plot(thresholds, vals)
    
In [ ]:
    
# Threshold sweep for the reduced-vocabulary predictor.
plotLog2, plotFalse2, plotx2 = calculateEffect(costTrollPredict2)
    
In [ ]:
    
# Green: % of trolls caught; red: % of non-trolls flagged; blue: the gap between them.
plotDiff2 = list(map(lambda a,b: a-b,plotLog2,plotFalse2 ))
# The predictor behind these curves scores with usedNeg/usedPos, which are the
# [:20] slices of the ranked lists, so the title says 20 rather than 100.
plt.title('Plot of % true positives against false positives (Trolls caught) using top 20 polarizing')
plt.plot(plotx2,plotLog2,c='g')
plt.plot(plotx2,plotFalse2,c='r')
plt.plot(plotx2,plotDiff2,c='b')
plt.axvline(0,linestyle = 'dashed')
    
In [ ]:
    
def calculateEffectAt0(predFunc):
    """Measure hit rates at threshold 0 while growing the scoring vocabulary.

    For each size 1, 6, ..., 196 the notebook-level ``usedNeg`` and
    ``usedPos`` are rebound to the first ``size`` entries of ``sortedNeg``
    and ``sortedPos``, then ``predFunc([0])`` is evaluated. The globals are
    left at their last assigned value when the function returns.

    Parameters:
        predFunc: callable taking ``[threshold]`` and returning one boolean
            prediction per record.

    Returns:
        (plotLog, plotFalse, plotx): per size, the percentage of True labels
        in ``output`` predicted True, the percentage of False labels
        predicted True, and the size itself.
    """
    global usedNeg
    global usedPos
    notTroll = (output == False)
    caughtPc, falseAlarmPc, sizes = [], [], []
    for size in range(1,200,5):
        usedNeg = sortedNeg[:size]
        usedPos = sortedPos[:size]
        flagged = np.asarray(predFunc([0]))
        caught = output[flagged].sum()
        falseAlarms = notTroll[flagged].sum()
        falseAlarmPc.append((100/notTroll.sum())*falseAlarms)
        caughtPc.append((100/output.sum())*caught)
        sizes.append(size)
    return caughtPc, falseAlarmPc, sizes
    
In [ ]:
    
# Vocabulary-size sweep at threshold 0 for the reduced-vocabulary predictor.
plotLog3, plotFalse3, plotx3 = calculateEffectAt0(costTrollPredict2)
    
In [ ]:
    
# Green: % of trolls caught; red: % of non-trolls flagged; blue: the gap between them.
plotDiff3 = [caught - flagged for caught, flagged in zip(plotLog3, plotFalse3)]
plt.title('Plot of % true positives against false positives (Trolls caught) using top x polarizing')
for series, colour in ((plotLog3, 'g'), (plotFalse3, 'r'), (plotDiff3, 'b')):
    plt.plot(plotx3, series, c=colour)
plt.axvline(0,linestyle = 'dashed')
    
In [ ]: